This file discribe each step of getting the read count data we used in last several week. Based on Gene2 Server.
Workstation one can found in https://github.com/HalforcNull/STATBioinformatics/blob/master/HW%205.Rmd
username="[Put Your Username of linux]"
NCBISRAToolkitUrl="https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz"
# Navigate to your nfs folder
cd /nfs/$username
# Download and rename sratoolkit zip package
wget $NCBISRAToolkitUrl -O sratoolkit.tar.gz
# Unzip
tar -zxf sratoolkit.tar.gz
# Rename the unzipped folder name
mv sratoolkit.2.9.2-centos_linux64 sratoolkit
nohup sratoolkit/bin/fastq-dump --gzip --split-files SRR493366 &
nohup sratoolkit/bin/fastq-dump --gzip --split-files SRR493367 &
nohup sratoolkit/bin/fastq-dump --gzip --split-files SRR493368 &
nohup sratoolkit/bin/fastq-dump --gzip --split-files SRR493369 &
nohup sratoolkit/bin/fastq-dump --gzip --split-files SRR493370 &
nohup sratoolkit/bin/fastq-dump --gzip --split-files SRR493371 &
The downloading process will take hours.
Gene2 contains fastqc module. So we can load and use it.
module load FastQC/0.11.7
nohup fastqc -q SRR493366_1.fastq.gz &
nohup fastqc -q SRR493366_2.fastq.gz &
nohup fastqc -q SRR493367_1.fastq.gz &
nohup fastqc -q SRR493367_2.fastq.gz &
nohup fastqc -q SRR493368_1.fastq.gz &
nohup fastqc -q SRR493368_2.fastq.gz &
nohup fastqc -q SRR493369_1.fastq.gz &
nohup fastqc -q SRR493369_2.fastq.gz &
nohup fastqc -q SRR493370_1.fastq.gz &
nohup fastqc -q SRR493370_2.fastq.gz &
nohup fastqc -q SRR493371_1.fastq.gz &
nohup fastqc -q SRR493371_2.fastq.gz &
# Download and rename sratoolkit zip package
wget https://github.com/pachterlab/kallisto/releases/download/v0.44.0/kallisto_linux-v0.44.0.tar.gz
# Unzip
tar -xzvf kallisto_linux-v0.44.0.tar.gz
# Rename the unzipped folder name
mv kallisto_linux-v0.44.0 kallisto
Since we are discussing human cancer, we are download cdna and ncrna of homo sapiens. After download, we combine them into one data file.
wget ftp://ftp.ensembl.org/pub/current_fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
wget ftp://ftp.ensembl.org/pub/current_fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz
cat Homo_sapiens.GRCh38.cdna.all.fa.gz Homo_sapiens.GRCh38.ncrna.fa.gz > Homo_sapiens.GRCh38.rna.fa.gz
nohup kallisto/kallisto index -i hsGRCh38_kallisto Homo_sapiens.GRCh38.rna.fa.gz &
-i hsGRCh38_kallisto : the index we just build
-t 4 : use 4 core
If this is single end read, we need setup the read length using: -l
nohup kallisto/kallisto quant -i hsGRCh38_kallisto -t 4 -o SRR493366 SRR493366_1.fastq.gz SRR493366_2.fastq.gz &
nohup kallisto/kallisto quant -i hsGRCh38_kallisto -t 4 -o SRR493367 SRR493367_1.fastq.gz SRR493367_2.fastq.gz &
nohup kallisto/kallisto quant -i hsGRCh38_kallisto -t 4 -o SRR493368 SRR493368_1.fastq.gz SRR493368_2.fastq.gz &
nohup kallisto/kallisto quant -i hsGRCh38_kallisto -t 4 -o SRR493369 SRR493369_1.fastq.gz SRR493369_2.fastq.gz &
nohup kallisto/kallisto quant -i hsGRCh38_kallisto -t 4 -o SRR493370 SRR493370_1.fastq.gz SRR493370_2.fastq.gz &
nohup kallisto/kallisto quant -i hsGRCh38_kallisto -t 4 -o SRR493371 SRR493371_1.fastq.gz SRR493371_2.fastq.gz &
library(EnsDb.Hsapiens.v86)
library(dplyr)
library('tximport')
esdb <- EnsDb.Hsapiens.v86
newtxs <- transcripts(esdb, return.type = 'data.frame')
k <- keys(esdb, keytype = "TXNAME")
tx2gene <- dplyr::select(newtxs, one_of(c('tx_name', 'gene_id')))
colnames(tx2gene) <- c('TXNAME', 'GENEID')
files <- c(
'SRR493366/abundance.tsv',
'SRR493367/abundance.tsv',
'SRR493368/abundance.tsv',
'SRR493369/abundance.tsv',
'SRR493370/abundance.tsv',
'SRR493371/abundance.tsv')
names(files) <- c('Scramble1','Scramble2','Scramble3','HOXA1KD1','HOXA1KD2','HOXA1KD3')
txi.kallisto.tsv <- tximport(files, type = "kallisto", tx2gene = tx2gene, ignoreAfterBar = TRUE, ignoreTxVersion=TRUE )
write.csv(txi.kallisto.tsv$counts, file="ReadCount(Kallisto).csv", row.names = TRUE)
library("edgeR")
## Loading required package: limma
library("plotly")
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
dat.kallisto <- read.csv('ReadCount(Kallisto).csv', row.names = 1)
dat.origin <- read.csv('GSE37704.csv', row.names = 1)
nrow(dat.kallisto)
## [1] 61950
nrow(dat.origin)
## [1] 19808
log.dat.kallisto <- cpm(dat.kallisto, log = TRUE )
log.dat.origin <- cpm(dat.origin, log = TRUE )
p <- plot_ly(type='scatter', mode='lines')
for( i in 1:6) {
tmp.density <- density(log.dat.kallisto[,i])
p <- p %>% add_lines(x = tmp.density$x, y = tmp.density$y)
}
p
p <- plot_ly(type='scatter', mode='lines')
for( i in 1:6) {
tmp.density <- density(log.dat.origin[,i])
p <- p %>% add_lines(x = tmp.density$x, y = tmp.density$y)
}
p
cut.log.kallisto <- log.dat.kallisto[rowSums(log.dat.kallisto) > -5, ]
cut.log.origin <- log.dat.origin[rowSums(log.dat.origin) > -5, ]
nrow(cut.log.kallisto)
## [1] 14680
nrow(cut.log.origin)
## [1] 12838
p <- plot_ly(type='scatter', mode='lines')
for( i in 1:6) {
tmp.density <- density(cut.log.kallisto[,i])
p <- p %>% add_lines(x = tmp.density$x, y = tmp.density$y)
}
p
p <- plot_ly(type='scatter', mode='lines')
for( i in 1:6) {
tmp.density <- density(cut.log.origin[,i])
p <- p %>% add_lines(x = tmp.density$x, y = tmp.density$y)
}
p